Make some paper figures¶

The next cell is tagged parameters for papermill parameterization:

In [1]:
# tagged parameters for `papermill`
In [2]:
# Parameters
params = {
    "func_scores_293T-Mxra8-E3E2-B-241028-func-1": "results/func_scores/293T-Mxra8-E3E2-B-241028-func-1_func_scores.csv",
    "func_scores_293T-Mxra8-E3E2-B-241028-func-2": "results/func_scores/293T-Mxra8-E3E2-B-241028-func-2_func_scores.csv",
    "func_scores_293T-TIM1-E3E2-B-241028-func-1": "results/func_scores/293T-TIM1-E3E2-B-241028-func-1_func_scores.csv",
    "func_scores_293T-TIM1-E3E2-B-241028-func-2": "results/func_scores/293T-TIM1-E3E2-B-241028-func-2_func_scores.csv",
    "func_scores_293T-Mxra8-6KE1-B-241028-func-1": "results/func_scores/293T-Mxra8-6KE1-B-241028-func-1_func_scores.csv",
    "func_scores_293T-Mxra8-6KE1-B-241028-func-2": "results/func_scores/293T-Mxra8-6KE1-B-241028-func-2_func_scores.csv",
    "func_scores_293T-TIM1-6KE1-B-241028-func-1": "results/func_scores/293T-TIM1-6KE1-B-241028-func-1_func_scores.csv",
    "func_scores_293T-TIM1-6KE1-B-241028-func-2": "results/func_scores/293T-TIM1-6KE1-B-241028-func-2_func_scores.csv",
    "func_scores_293T-Mxra8-E3E2-A-241113-func-1": "results/func_scores/293T-Mxra8-E3E2-A-241113-func-1_func_scores.csv",
    "func_scores_293T-Mxra8-E3E2-A-241113-func-2": "results/func_scores/293T-Mxra8-E3E2-A-241113-func-2_func_scores.csv",
    "func_scores_293T-TIM1-E3E2-A-241113-func-1": "results/func_scores/293T-TIM1-E3E2-A-241113-func-1_func_scores.csv",
    "func_scores_293T-TIM1-E3E2-A-241113-func-2": "results/func_scores/293T-TIM1-E3E2-A-241113-func-2_func_scores.csv",
    "func_scores_293T-Mxra8-6KE1-A-241113-func-1": "results/func_scores/293T-Mxra8-6KE1-A-241113-func-1_func_scores.csv",
    "func_scores_293T-Mxra8-6KE1-A-241113-func-2": "results/func_scores/293T-Mxra8-6KE1-A-241113-func-2_func_scores.csv",
    "func_scores_293T-TIM1-6KE1-A-241113-func-1": "results/func_scores/293T-TIM1-6KE1-A-241113-func-1_func_scores.csv",
    "func_scores_293T-TIM1-6KE1-A-241113-func-2": "results/func_scores/293T-TIM1-6KE1-A-241113-func-2_func_scores.csv",
    "func_scores_C636-E3E2-B-241122-func-1": "results/func_scores/C636-E3E2-B-241122-func-1_func_scores.csv",
    "func_scores_C636-E3E2-B-241122-func-2": "results/func_scores/C636-E3E2-B-241122-func-2_func_scores.csv",
    "func_scores_C636-6KE1-B-241122-func-1": "results/func_scores/C636-6KE1-B-241122-func-1_func_scores.csv",
    "func_scores_C636-6KE1-B-241122-func-2": "results/func_scores/C636-6KE1-B-241122-func-2_func_scores.csv",
    "func_scores_C636-E3E2-A-241212-func-1": "results/func_scores/C636-E3E2-A-241212-func-1_func_scores.csv",
    "func_scores_C636-E3E2-A-241212-func-2": "results/func_scores/C636-E3E2-A-241212-func-2_func_scores.csv",
    "func_scores_C636-6KE1-A-241212-func-1": "results/func_scores/C636-6KE1-A-241212-func-1_func_scores.csv",
    "func_scores_C636-6KE1-A-241212-func-2": "results/func_scores/C636-6KE1-A-241212-func-2_func_scores.csv",
    "func_effects_293T-Mxra8_entry_293T-Mxra8-E3E2-A-241113-func-1": "results/func_effects/by_selection/293T-Mxra8-E3E2-A-241113-func-1_func_effects.csv",
    "func_effects_293T-Mxra8_entry_293T-Mxra8-E3E2-A-241113-func-2": "results/func_effects/by_selection/293T-Mxra8-E3E2-A-241113-func-2_func_effects.csv",
    "func_effects_293T-Mxra8_entry_293T-Mxra8-E3E2-B-241028-func-1": "results/func_effects/by_selection/293T-Mxra8-E3E2-B-241028-func-1_func_effects.csv",
    "func_effects_293T-Mxra8_entry_293T-Mxra8-E3E2-B-241028-func-2": "results/func_effects/by_selection/293T-Mxra8-E3E2-B-241028-func-2_func_effects.csv",
    "func_effects_293T-Mxra8_entry_293T-Mxra8-6KE1-A-241113-func-1": "results/func_effects/by_selection/293T-Mxra8-6KE1-A-241113-func-1_func_effects.csv",
    "func_effects_293T-Mxra8_entry_293T-Mxra8-6KE1-A-241113-func-2": "results/func_effects/by_selection/293T-Mxra8-6KE1-A-241113-func-2_func_effects.csv",
    "func_effects_293T-Mxra8_entry_293T-Mxra8-6KE1-B-241028-func-1": "results/func_effects/by_selection/293T-Mxra8-6KE1-B-241028-func-1_func_effects.csv",
    "func_effects_293T-Mxra8_entry_293T-Mxra8-6KE1-B-241028-func-2": "results/func_effects/by_selection/293T-Mxra8-6KE1-B-241028-func-2_func_effects.csv",
    "func_effects_293T-TIM1_entry_293T-TIM1-E3E2-A-241113-func-1": "results/func_effects/by_selection/293T-TIM1-E3E2-A-241113-func-1_func_effects.csv",
    "func_effects_293T-TIM1_entry_293T-TIM1-E3E2-A-241113-func-2": "results/func_effects/by_selection/293T-TIM1-E3E2-A-241113-func-2_func_effects.csv",
    "func_effects_293T-TIM1_entry_293T-TIM1-E3E2-B-241028-func-1": "results/func_effects/by_selection/293T-TIM1-E3E2-B-241028-func-1_func_effects.csv",
    "func_effects_293T-TIM1_entry_293T-TIM1-E3E2-B-241028-func-2": "results/func_effects/by_selection/293T-TIM1-E3E2-B-241028-func-2_func_effects.csv",
    "func_effects_293T-TIM1_entry_293T-TIM1-6KE1-A-241113-func-1": "results/func_effects/by_selection/293T-TIM1-6KE1-A-241113-func-1_func_effects.csv",
    "func_effects_293T-TIM1_entry_293T-TIM1-6KE1-A-241113-func-2": "results/func_effects/by_selection/293T-TIM1-6KE1-A-241113-func-2_func_effects.csv",
    "func_effects_293T-TIM1_entry_293T-TIM1-6KE1-B-241028-func-1": "results/func_effects/by_selection/293T-TIM1-6KE1-B-241028-func-1_func_effects.csv",
    "func_effects_293T-TIM1_entry_293T-TIM1-6KE1-B-241028-func-2": "results/func_effects/by_selection/293T-TIM1-6KE1-B-241028-func-2_func_effects.csv",
    "func_effects_C636_entry_C636-E3E2-B-241122-func-1": "results/func_effects/by_selection/C636-E3E2-B-241122-func-1_func_effects.csv",
    "func_effects_C636_entry_C636-E3E2-B-241122-func-2": "results/func_effects/by_selection/C636-E3E2-B-241122-func-2_func_effects.csv",
    "func_effects_C636_entry_C636-6KE1-B-241122-func-1": "results/func_effects/by_selection/C636-6KE1-B-241122-func-1_func_effects.csv",
    "func_effects_C636_entry_C636-6KE1-B-241122-func-2": "results/func_effects/by_selection/C636-6KE1-B-241122-func-2_func_effects.csv",
    "func_effects_C636_entry_C636-E3E2-A-241212-func-1": "results/func_effects/by_selection/C636-E3E2-A-241212-func-1_func_effects.csv",
    "func_effects_C636_entry_C636-E3E2-A-241212-func-2": "results/func_effects/by_selection/C636-E3E2-A-241212-func-2_func_effects.csv",
    "func_effects_C636_entry_C636-6KE1-A-241212-func-1": "results/func_effects/by_selection/C636-6KE1-A-241212-func-1_func_effects.csv",
    "func_effects_C636_entry_C636-6KE1-A-241212-func-2": "results/func_effects/by_selection/C636-6KE1-A-241212-func-2_func_effects.csv",
    "codon_variants": "results/variants/codon_variants.csv",
    "annotated_mut_summary": "results/annotated_summary_csvs/entry_293T-Mxra8_C636_293T-TIM1_Mxra8-binding_annotated.csv",
    "annotated_site_summary": "results/annotated_summary_csvs/entry_293T-Mxra8_C636_293T-TIM1_Mxra8-binding_annotated_site_means.csv",
    "nb": "notebooks/paper_figures.ipynb",
}
min_times_seen = 2

Python imports:

In [3]:
import itertools
import os

import altair as alt

import dms_variants.codonvarianttable

import numpy

import pandas as pd

import scipy.stats

_ = alt.data_transformers.disable_max_rows()

Distribution of variant functional scores¶

We want the distribution of variant functional scores, similar to as made by this notebook but including both E3-E2 and 6K-E1 fragments and not including deletions (since those are rare in our libraries).

First read all the functional scores, ignoring those for deletions since those are rare and not reported in paper:

In [4]:
def classify_selection(sel):
    sels = {"293T-Mxra8": "293T-Mxra8", "293T-TIM1":"293T-TIM1", "C636":"C6/36"}
    assert sum(s in sel for s in sels) == 1, sel
    for s in sels:
        if s in sel:
            label = [sels[s]]
    libs = {"-A-": "library A", "-B-": "library B"}
    assert sum(l in sel for l in libs) == 1, sel
    for l in libs:
        if l in sel:
            label.append(libs[l])
    return " ".join(label)


func_scores_df = (
    pd.concat(
        [
            pd.read_csv(f).assign(selection=sel)
            for (sel, f) in params.items() if sel.startswith("func_scores_")
        ]
    )
    .assign(selection=lambda x: x["selection"].map(classify_selection))
    .pipe(dms_variants.codonvarianttable.CodonVariantTable.classifyVariants)
    .query("variant_class != 'deletion'")
)

(
    func_scores_df
    .groupby(["selection", "variant_class"])
    .aggregate(n_variants=pd.NamedAgg("barcode", "count"))
)
Out[4]:
n_variants
selection variant_class
293T-Mxra8 library A 1 nonsynonymous 142665
>1 nonsynonymous 79744
stop 5778
synonymous 4245
wildtype 29406
293T-Mxra8 library B 1 nonsynonymous 132342
>1 nonsynonymous 73868
stop 5855
synonymous 4048
wildtype 27337
293T-TIM1 library A 1 nonsynonymous 142805
>1 nonsynonymous 79859
stop 5789
synonymous 4237
wildtype 29411
293T-TIM1 library B 1 nonsynonymous 132443
>1 nonsynonymous 73929
stop 5863
synonymous 4048
wildtype 27349
C6/36 library A 1 nonsynonymous 140546
>1 nonsynonymous 78516
stop 5670
synonymous 4153
wildtype 28913
C6/36 library B 1 nonsynonymous 131133
>1 nonsynonymous 73139
stop 5797
synonymous 3999
wildtype 27056

Make the plot:

In [5]:
def ridgeplot(df):
    variant_classes = list(
        reversed(
            [
                c
                for c in [
                    "wildtype",
                    "synonymous",
                    "1 nonsynonymous",
                    ">1 nonsynonymous",
                    "deletion",
                    "stop",
                ]
                if c in set(df["variant_class"])
            ]
        )
    )

    assert set(df["variant_class"]) == set(variant_classes)

    # get smoothed distribution of functional scores
    bins = numpy.linspace(
        df["func_score"].min(),
        df["func_score"].max(),
        num=50,
    )
    smoothed_dist = pd.concat(
        [
            pd.DataFrame(
                {
                    "selection": sel,
                    "variant_class": var,
                    "func_score": bins,
                    "count": scipy.stats.gaussian_kde(df["func_score"])(bins),
                    "mean_func_score": df["func_score"].mean(),
                    "number of variants": len(df),
                }
            )
            for (sel, var), df in df.groupby(["selection", "variant_class"])
        ]
    )

    # assign y / y2 for plotting
    facet_overlap = 0.7  # maximal facet overlap
    max_count = (smoothed_dist["count"]).max()
    smoothed_dist = smoothed_dist.assign(
        y=lambda x: x["variant_class"].map(lambda v: variant_classes.index(v)),
        y2=lambda x: x["y"] + x["count"] / max_count / facet_overlap,
    )

    # ridgeline plot, based on this but using y / y2 rather than row:
    # https://altair-viz.github.io/gallery/ridgeline_plot.html
    ridgeline_chart = (
        alt.Chart(smoothed_dist)
        .encode(
            x=alt.X(
                "func_score", title="functional score for cell entry", scale=alt.Scale(nice=False)
            ),
            y=alt.Y(
                "y",
                scale=alt.Scale(nice=False),
                title=None,
                axis=alt.Axis(
                    ticks=False,
                    domain=False,
                    # set manual labels https://stackoverflow.com/a/64106056
                    values=[v + 0.5 for v in range(len(variant_classes))],
                    labelExpr=f"{str(variant_classes)}[round(datum.value - 0.5)]",
                ),
            ),
            y2=alt.Y2("y2"),
            fill=alt.Fill(
                "mean_func_score:Q",
                title="mean functional score",
                legend=alt.Legend(direction="horizontal"),
                scale=alt.Scale(scheme="yellowgreenblue"),
            ),
            facet=alt.Facet(
                "selection",
                columns=2,
                title=None,
                header=alt.Header(
                    labelFontWeight="bold",
                    labelPadding=0,
                ),
            ),
            tooltip=[
                "selection",
                "variant_class",
                alt.Tooltip(
                    "mean_func_score", format=".2f", title="mean functional score"
                ),
            ],
        )
        .mark_area(
            interpolate="monotone",
            smooth=True,
            fillOpacity=0.8,
            stroke="lightgray",
            strokeWidth=0.5,
        )
        .configure_view(stroke=None)
        .configure_axis(grid=False)
        .properties(width=180, height=22 * len(variant_classes))
    )

    ridgeline_chart = ridgeline_chart.properties(
        autosize=alt.AutoSizeParams(resize=True),
    )

    return ridgeline_chart


func_scores_chart = ridgeplot(func_scores_df)

func_scores_chart
Out[5]:

Number of mutations per variant¶

Plot number of mutations per variant as here but with better axis limits and labels, only keeping the combined libraries:

In [6]:
codon_variants = pd.read_csv(params["codon_variants"])

lib_rename = {"E_A": "library A", "E_B": "library B"}

display(
    codon_variants
    .groupby("library")
    .aggregate(n_variants=pd.NamedAgg("barcode", "count"))
)

max_muts = 4

print(f"Only keeping {lib_rename=}, and clipping at {max_muts=}")

nmuts_dist = (
    codon_variants
    .query("library in @lib_rename")
    .assign(
        library=lambda x: x["library"].map(lib_rename),
        n_muts=lambda x: x["n_aa_substitutions"].clip(upper=max_muts),
    )
    .groupby(["library", "n_muts"], as_index=False)
    .aggregate(n_variants=pd.NamedAgg("barcode", "count"))
    .assign(
        n_muts_label=lambda x: x["n_muts"].map(
            lambda n: str(n) if n < max_muts else f">{n - 1}"
        )
    )
)

nmuts_dist_chart = (
    alt.Chart(nmuts_dist)
    .encode(
        alt.X(
            "n_muts_label",
            sort=alt.SortField("n_muts"),
            title="number amino-acid mutations",
            axis=alt.Axis(labelAngle=0),
        ),
        alt.Y("n_variants", title="number of barcoded variants"),
        alt.Column(
            "library",
            title=None,
            header=alt.Header(labelFontStyle="bold", labelFontSize=11, labelPadding=0),
        ),
    )
    .mark_bar(color="black")
    .configure_axis(grid=False)
    .properties(height=150, width=150)
)

nmuts_dist_chart
n_variants
library
6KE1_A 69359
6KE1_B 62446
E3E2_A 65426
E3E2_B 62495
E_A 134757
E_B 124915
Only keeping lib_rename={'E_A': 'library A', 'E_B': 'library B'}, and clipping at max_muts=4
Out[6]:

Plot correlation among estimated functional effects on each cell¶

Plot similar to here but aggregating across libraries:

In [7]:
func_effects_by_lib = (
    pd.concat(
        [
            pd.read_csv(f).assign(
                name=name,
                cell=name.split("_")[2],
                replicate=name.split("-")[-1],
                library="library A" if "-A-" in name else "library B",
                region="E3E2" if "E3E2" in name else "6KE1",
            )
            for (name, f) in params.items() if name.startswith("func_effects_")
        ],
        ignore_index=True,
    )
    .query("wildtype != mutant")
    .query("times_seen >= @min_times_seen")
    .assign(
        mut_in_region=lambda x: x.apply(
            lambda r: (
                ("6K" in r["site"] or "E1" in r["site"]) and (r["region"] == "6KE1")
                or ("E2" in r["site"] or "E2" in r["site"]) and (r["region"] == "E3E2")
            ),
            axis=1,
        ),
    )
    .query("mut_in_region")
    .assign(
        library_replicate=lambda x: x["library"] + ", replicate " + x["replicate"],
        cell=lambda x: x["cell"].map(
            {
                "C636": "entry in C6/36 cells",
                "293T-Mxra8": "entry in 293T-Mxra8 cells",
                "293T-TIM1": "entry in 293T-TIM1 cells",
            }
        ),
    )
    [["site", "wildtype", "mutant", "functional_effect", "library_replicate", "cell"]]
)

func_effects_by_lib
Out[7]:
site wildtype mutant functional_effect library_replicate cell
1346 1(E2) S - -0.98530 library A, replicate 1 entry in 293T-Mxra8 cells
1347 1(E2) S A 0.24300 library A, replicate 1 entry in 293T-Mxra8 cells
1348 1(E2) S C -1.91700 library A, replicate 1 entry in 293T-Mxra8 cells
1349 1(E2) S D -0.17820 library A, replicate 1 entry in 293T-Mxra8 cells
1350 1(E2) S E -0.71480 library A, replicate 1 entry in 293T-Mxra8 cells
... ... ... ... ... ... ...
306391 439(E1) H Y -0.37860 library A, replicate 2 entry in C6/36 cells
306393 440(E1) * K 0.06401 library A, replicate 2 entry in C6/36 cells
306394 440(E1) * Q -2.44000 library A, replicate 2 entry in C6/36 cells
306395 440(E1) * W -0.49750 library A, replicate 2 entry in C6/36 cells
306396 440(E1) * Y -0.23450 library A, replicate 2 entry in C6/36 cells

210846 rows × 6 columns

Now make the plot:

In [8]:
for cell, cell_df in func_effects_by_lib.groupby("cell"):
    corr_panels = []
    for sel1, sel2 in itertools.combinations(sorted(cell_df["library_replicate"].unique()), 2):
        corr_df = (
            cell_df.query("library_replicate == @sel1")[["functional_effect", "site", "mutant"]]
            .rename(columns={"functional_effect": sel1})
            .merge(
                cell_df.query("library_replicate == @sel2")[["functional_effect", "site", "mutant"]].rename(
                    columns={"functional_effect": sel2}
                ),
                validate="one_to_one",
            )
            .drop(columns=["site", "mutant"])
        )
        n = len(corr_df)
        r = corr_df[[sel1, sel2]].corr().values[1, 0]
        corr_panels.append(
            alt.Chart(corr_df)
            .encode(
                alt.X(sel1, scale=alt.Scale(nice=False, padding=4)),
                alt.Y(sel2, scale=alt.Scale(nice=False, padding=4)),
            )
            .mark_circle(color="black", size=25, opacity=0.15)
            .properties(
                width=135,
                height=135,
                title=alt.TitleParams(
                    f"R = {r:.2f}, N = {n}", fontSize=11, fontWeight="normal", dy=2
                ),
            )
        )
    
    corr_chart = alt.hconcat(*corr_panels, spacing=5).configure_axis(grid=False).properties(
        title=alt.TitleParams(f"correlations among libraries and replicates for mutations effects on {cell}", anchor="middle")
    )
    display(corr_chart)
In [ ]: